Requirement already satisfied: requests in c:\users\shuos\anaconda3\lib\site-packages (2.32.2)
Requirement already satisfied: charset-normalizer<4,>=2 in c:\users\shuos\anaconda3\lib\site-packages (from requests) (2.0.4)
Requirement already satisfied: idna<4,>=2.5 in c:\users\shuos\anaconda3\lib\site-packages (from requests) (3.7)
Requirement already satisfied: urllib3<3,>=1.21.1 in c:\users\shuos\anaconda3\lib\site-packages (from requests) (2.2.2)
Requirement already satisfied: certifi>=2017.4.17 in c:\users\shuos\anaconda3\lib\site-packages (from requests) (2024.8.30)
Note: you may need to restart the kernel to use updated packages.
In [2]:
pip install html5lib
Collecting html5lib
Note: you may need to restart the kernel to use updated packages.
Downloading html5lib-1.1-py2.py3-none-any.whl.metadata (16 kB)
Requirement already satisfied: six>=1.9 in c:\users\shuos\anaconda3\lib\site-packages (from html5lib) (1.16.0)
Requirement already satisfied: webencodings in c:\users\shuos\anaconda3\lib\site-packages (from html5lib) (0.5.1)
Downloading html5lib-1.1-py2.py3-none-any.whl (112 kB)
---------------------------------------- 0.0/112.2 kB ? eta -:--:--
--- ------------------------------------ 10.2/112.2 kB ? eta -:--:--
------------------------ -------------- 71.7/112.2 kB 975.2 kB/s eta 0:00:01
-------------------------------------- 112.2/112.2 kB 926.7 kB/s eta 0:00:00
Installing collected packages: html5lib
Successfully installed html5lib-1.1
In [3]:
pip install bs4
Requirement already satisfied: bs4 in c:\users\shuos\anaconda3\lib\site-packages (0.0.2)
Requirement already satisfied: beautifulsoup4 in c:\users\shuos\anaconda3\lib\site-packages (from bs4) (4.12.3)
Requirement already satisfied: soupsieve>1.2 in c:\users\shuos\anaconda3\lib\site-packages (from beautifulsoup4->bs4) (2.5)
Note: you may need to restart the kernel to use updated packages.
In [4]:
pip install pandas
Requirement already satisfied: pandas in c:\users\shuos\anaconda3\lib\site-packages (2.2.2)
Requirement already satisfied: numpy>=1.26.0 in c:\users\shuos\anaconda3\lib\site-packages (from pandas) (1.26.4)
Requirement already satisfied: python-dateutil>=2.8.2 in c:\users\shuos\anaconda3\lib\site-packages (from pandas) (2.9.0.post0)
Requirement already satisfied: pytz>=2020.1 in c:\users\shuos\anaconda3\lib\site-packages (from pandas) (2024.1)
Requirement already satisfied: tzdata>=2022.7 in c:\users\shuos\anaconda3\lib\site-packages (from pandas) (2023.3)
Requirement already satisfied: six>=1.5 in c:\users\shuos\anaconda3\lib\site-packages (from python-dateutil>=2.8.2->pandas) (1.16.0)
Note: you may need to restart the kernel to use updated packages.
In [5]:
# Imports for the scraper, one statement per line so the cell parses.
import re

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Download and parse the chart page in this cell so that `soup` exists on a
# fresh kernel (Restart & Run All) instead of relying on leftover state.
url = 'http://www.imdb.com/chart/top'
response = requests.get(url, timeout=30)  # never hang forever on a dead connection
response.raise_for_status()               # fail loudly on 4xx/5xx instead of parsing an error page
soup = BeautifulSoup(response.text, "html.parser")

# One entry per chart row.
# NOTE(review): these selectors assume the chart is rendered as a table with
# `td.titleColumn` / `td.posterColumn` cells - confirm against the live page.
movies = soup.select('td.titleColumn')
# The `title` attribute of each title link is used as the cast/crew string.
crew = [a.attrs.get('title') for a in soup.select('td.titleColumn a')]
# The rating is read from the `data-value` attribute of the `span[name=ir]` element.
ratings = [b.attrs.get('data-value')
           for b in soup.select('td.posterColumn span[name=ir]')]
In [8]:
def _parse_chart_entry(entry_text):
    """Split the text of one chart cell into ``(place, movie_title, year)``.

    After whitespace is collapsed the text is expected to look like
    ``"10. Fight Club (1999)"``: a numeric rank followed by a dot, the title,
    and a trailing parenthesised year.

    The rank is read from the text itself. Slicing by the digit count of a
    0-based loop index truncates ranks 10 and 100, and stripping every '.'
    also removes dots that belong to the title.

    Raises:
        ValueError: if the text does not match the expected layout.
    """
    collapsed = ' '.join(entry_text.split())
    # Raw string: '\(' is not a valid escape in a normal string literal.
    # The year is the *last* parenthesised group, so titles that themselves
    # contain parentheses are handled.
    match = re.match(r'^(\d+)\.\s*(.*?)\s*\(([^()]*)\)$', collapsed)
    if match is None:
        raise ValueError(f"Unexpected chart entry format: {collapsed!r}")
    return match.group(1), match.group(2), match.group(3)


# The three scraped sequences are consumed in lockstep; a length mismatch means
# the page did not have the expected structure and rows would be misaligned.
if not (len(movies) == len(ratings) == len(crew)):
    raise ValueError(
        "Scraped columns differ in length: "
        f"{len(movies)} titles, {len(ratings)} ratings, {len(crew)} cast entries"
    )

# Collected movie records, one dict per chart entry.
# NOTE: the name `list` shadows the builtin; it is kept because the CSV-export
# cell reads this exact name.
list = []

for entry, rating, cast in zip(movies, ratings, crew):
    place, movie_title, year = _parse_chart_entry(entry.get_text())
    list.append({
        "place": place,
        "movie_title": movie_title,
        "rating": rating,
        "year": year,
        "star_cast": cast,
    })

# Print every movie with its cast and rating.
for movie in list:
    print(movie['place'], '-', movie['movie_title'], '('+movie['year'] +
          ') -', 'Starring:', movie['star_cast'], movie['rating'])
In [10]:
# Turn the collected movie records into a DataFrame, then write it to a CSV
# file in the current working directory without the index column.
df = pd.DataFrame(data=list)
df.to_csv(path_or_buf='imdb_top_250_movies.csv', index=False)
In [11]:
import re

import pandas as pd
import requests
from bs4 import BeautifulSoup


def _parse_chart_entry(entry_text):
    """Split the text of one chart cell into ``(place, movie_title, year)``.

    After whitespace is collapsed the text is expected to look like
    ``"10. Fight Club (1999)"``: a numeric rank followed by a dot, the title,
    and a trailing parenthesised year.

    The rank is read from the text itself. Slicing by the digit count of a
    0-based loop index truncates ranks 10 and 100, and stripping every '.'
    also removes dots that belong to the title.

    Raises:
        ValueError: if the text does not match the expected layout.
    """
    collapsed = ' '.join(entry_text.split())
    # Raw string: '\(' is not a valid escape in a normal string literal.
    # The year is the *last* parenthesised group, so titles that themselves
    # contain parentheses are handled.
    match = re.match(r'^(\d+)\.\s*(.*?)\s*\(([^()]*)\)$', collapsed)
    if match is None:
        raise ValueError(f"Unexpected chart entry format: {collapsed!r}")
    return match.group(1), match.group(2), match.group(3)


# Download the IMDb top 250 chart page.
url = 'http://www.imdb.com/chart/top'
response = requests.get(url, timeout=30)  # never hang forever on a dead connection
response.raise_for_status()               # fail loudly on 4xx/5xx instead of parsing an error page
soup = BeautifulSoup(response.text, "html.parser")

# One entry per chart row.
# NOTE(review): these selectors assume the chart is rendered as a table with
# `td.titleColumn` / `td.posterColumn` cells - confirm against the live page.
movies = soup.select('td.titleColumn')
# The `title` attribute of each title link is used as the cast/crew string.
crew = [a.attrs.get('title') for a in soup.select('td.titleColumn a')]
# The rating is read from the `data-value` attribute of the `span[name=ir]` element.
ratings = [b.attrs.get('data-value')
           for b in soup.select('td.posterColumn span[name=ir]')]

# The three scraped sequences are consumed in lockstep; a length mismatch means
# the page did not have the expected structure and rows would be misaligned.
if not (len(movies) == len(ratings) == len(crew)):
    raise ValueError(
        "Scraped columns differ in length: "
        f"{len(movies)} titles, {len(ratings)} ratings, {len(crew)} cast entries"
    )

# One dict per chart entry. Named `movie_records` rather than `list` so the
# builtin `list` is not shadowed for the rest of the kernel session.
movie_records = []
for entry, rating, cast in zip(movies, ratings, crew):
    place, movie_title, year = _parse_chart_entry(entry.get_text())
    movie_records.append({
        "place": place,
        "movie_title": movie_title,
        "rating": rating,
        "year": year,
        "star_cast": cast,
    })

# Print every movie with its cast and rating.
for movie in movie_records:
    print(movie['place'], '-', movie['movie_title'], '('+movie['year'] +
          ') -', 'Starring:', movie['star_cast'], movie['rating'])

# Save the records as a DataFrame, then write them to CSV without the index column.
df = pd.DataFrame(movie_records)
df.to_csv('imdb_top_250_movies.csv', index=False)